#Input--a formatted LexisNexis file
#Output--the same stories with duplicate headlines removed;
#a headline is a duplicate when it repeats within the next 50 stories
#the no-dups file is produced in the working folder

##
##  Programming supported by National Science Foundation Political Science Program Grant
##  SES-0719634 "Improving the Efficiency of Militarized Interstate Dispute Data Collection using
##  Automated Textual Analysis" and SES-0924240, "MID4: Updating the Militarized Dispute Data Set
##  2002-2010."

#!/usr/local/bin/perl

# ======== globals =========== #
$dups = 0;   # number of duplicate headlines found so far, initially none
$lens = 50;  # how many stories below a given one are searched for duplicates

# ======== main program =========== #

# The single command-line argument is the input file prefix.
$inputPrefix = $ARGV[0];

# A missing or empty prefix is a usage error: report it on STDERR and exit
# with a non-zero status so that a calling shell or script can detect it.
if (!defined($inputPrefix) || length($inputPrefix) == 0) {
  print STDERR "\aPlease enter Lexus-Nexus formatted input file prefix and re-run program\n";
  exit 1;
}

# Open the input file and the four output files.
# Every open uses the three-argument form so that characters in the
# user-supplied prefix (a leading '>', '<' or '|', surrounding whitespace)
# are always treated as part of the file name and never as an open mode.
$inputFile = $inputPrefix.".genfed";      # formatted news file to read

open(FDIR, '<', $inputFile) or die "Can\'t open input file $inputFile; error $!";

$outfile  = $inputPrefix.".NoDups";       # headlines left after duplicate removal
$dupFile  = $inputPrefix.".NoDupStories"; # stories whose headline is not a duplicate
# NOTE(review): unlike the other outputs this name has no "." before the
# suffix; kept as is because existing tooling may rely on the file name.
$headFile = $inputPrefix."AllHeaders";    # every headline found in the input
$dupf     = $inputPrefix.".Dups";         # duplicated headlines

open(FOUT, '>', $outfile)  or die "Can\'t open file $outfile; error $!";
open(FDUP, '>', $dupFile)  or die "Can\'t open file $dupFile error $!";
open(ALL,  '>', $headFile) or die "Can\'t open file $headFile error $!";
open(DUPS, '>', $dupf)     or die "Can\'t open file $dupf error $!";

# First pass over the input: gather every headline into @headlines.
# A headline is the line that immediately follows a separator line of dashes.
$ka = 0;          # headlines collected so far; drives the progress report
@headlines = ();

while (defined(my $current = <FDIR>)) {
  next unless $current =~ m/---------------------------------------------------------------/;

  my $headline = <FDIR>;          # the line after a separator is the headline
  last unless defined $headline;  # separator was the last line: nothing follows it
  push(@headlines, $headline);

  # progress report every 10000 headlines
  if (0 == (++$ka % 10000)) { print "$ka  $headline"; }
}

$arl = @headlines;   # total number of headlines collected
close(FDIR);         # done with the first pass over the input file

# Save all collected headlines, duplicates included, for later checking.
print ALL @headlines;
close(ALL)  or die "Can\'t close file $headFile error $!";

# Return a copy of a headline with all trailing whitespace (including the
# newline) removed; an undefined headline is treated as the empty string.
sub trimmed_headline {
  my ($headline) = @_;
  return '' unless defined $headline;
  $headline =~ s/\s+\z//;
  return $headline;
}

# Remove duplicate headlines from @headlines in place.
# For every headline the next $lens headlines are inspected and any whose
# text (ignoring trailing whitespace) equals the current one is deleted.
# Each duplicated headline is written once, quoted, to the DUPS file.
my @trimmed = map { trimmed_headline($_) } @headlines;  # comparison keys, kept parallel to @headlines
my %dup_logged;                                         # headlines already written to DUPS

$now = 0;  # index of the headline currently being compared against its followers
while ($now < @headlines) {
  print $now."\n";
  my $current = $trimmed[$now];

  my $offset = 1;
  while ($offset <= $lens && $now + $offset < @headlines) {
    my $candidate = $now + $offset;
    if ($trimmed[$candidate] ne $current) {
      $offset++;
      next;
    }

    # Duplicate found: drop it from both arrays.  $offset is deliberately
    # NOT advanced, because the following headline has just moved into
    # this slot and still has to be compared.
    splice(@headlines, $candidate, 1);
    my ($removed) = splice(@trimmed, $candidate, 1);
    $arl = @headlines;
    $dups++;
    print "Equal found ".$current."==".$removed."; headlines left: ".$arl."\n";

    # quoted form is meant for use with grep; each headline is listed once
    print DUPS "\'".$current."\'\n" unless $dup_logged{$current}++;
  }

  $now++;  # go to the next headline
}
$arl = @headlines;  # number of headlines left after duplicate removal

print FOUT @headlines; #saves headlines for testing
close(FOUT) or die "Can\'t close file $outfile error $!";
close(DUPS)  or die "Can\'t close file $dupf error $!";

# Second pass: re-read the formatted news file and copy to FDUP only the
# stories whose headline is still present in @headlines.  @headlines is in
# file order, so one forward scan is enough: find headline $i, copy its
# story up to and including the next separator, then look for headline $i+1.
#
# Only a line that directly follows a separator is treated as a headline
# candidate, so a body line that happens to equal a headline is not matched.
#
# NOTE(review): matching is by headline text.  If a removed duplicate with
# the same text lies between two kept stories, the duplicate's story is the
# one copied for the later headline.
open(FDIR, '<', $inputFile) or die "Can\'t open input file $inputFile; error $!";

$arl = @headlines; #how many unique
print "Found $arl no duplicate headlines\n";

my $separator = qr/---------------------------------------------------------------/;

# The first line of the file is expected to be a separator; it is replaced
# by the separator written to FDUP below.
my $first_line     = <FDIR>;
my $is_candidate   = defined($first_line) && $first_line =~ $separator;

print FDUP "\n---------------------------------------------------------------\n";

$i = 0;            # index of the next headline to find
$line = <FDIR>;    # normally the first headline of the file
while ($i < $arl && defined $line) {
  if ($is_candidate) {
    print "Line $line matching with ".$headlines[$i]. "\n";
    if ($line eq $headlines[$i]) {
      # found match, grab story and output
      print "Processing headline # $i--$line\n";
      print FDUP $line;

      # copy the story, stopping after the separator that ends it
      while ($line !~ $separator) {
        $line = <FDIR>;
        last unless defined $line;   # end of file inside the last story
        print FDUP $line;
      }
      $i++;                          # next headline from the array
      last unless defined $line;
    }
  }

  # the line after a separator is the next headline candidate
  $is_candidate = ($line =~ $separator) ? 1 : 0;
  $line = <FDIR>;
}

close(FDIR);
close(FDUP) or die "Can\'t close file $dupFile error $!";

print "\n\aFinished: $arl headlines processed in total.\n";
exit;
